Load the Required Packages:

Below, we load the packages required for data analysis and visualization.

library(tidyverse)    # data wrangling and visualization
library(magrittr)     # the %<>% compound-assignment pipe
library(DBI)          # database interface
library(dbplyr)       # dplyr backend for databases
library(RMariaDB)     # MariaDB/MySQL driver
library(data.table)   # fast rbindlist() for stacking tables
library(stopwords)    # stopword lists
library(tidytext)     # tidy tokenization
library(RColorBrewer) # color palettes
library(DT)           # interactive data tables
library(wordcloud)    # word cloud plots

State the Research Question:

W. Edwards Deming said, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”

Connect to the SQL Database:

con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  # read the password from a local sql_db.txt file rather than hard-coding it
  password = as.character(read.table("sql_db.txt", header = FALSE)),
  host = "35.227.102.234")

Read from the SQL Database and Disconnect:

tables <- dbListTables(con)
jobs_df <- dbReadTable(con, "_Jobs")
datatable(jobs_df[, -3], options = list(pageLength = 25))
dbDisconnect(con)
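
Since dbplyr is attached, the same table can also be queried lazily, so filters run in the database and only matching rows come back. A minimal sketch (it would have to run before dbDisconnect(con), and assumes the filtering could usefully happen server-side):

jobs_tbl <- dplyr::tbl(con, "_Jobs") # lazy reference; no rows pulled yet
incomplete <- jobs_tbl %>%
    filter(Job_complete == 0) %>%    # translated to SQL, executed remotely
    collect()                        # pull only the matching rows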

Create a Data Frame of Jobs Found via RSS Feeds and LinkedIn by Combining CSV Files and Removing Duplicates:

completed_files <- readLines("completed_files.txt")

url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"

new_jobs_df <- as.data.frame(matrix(nrow = 0, ncol = 11))
files <- list.files(pattern = "_feeds_.*csv$")

for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste(url_base, files[i], sep = "")
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df <- rbind(new_jobs_df, csv)
        completed_files <- append(completed_files, files[i])
    }
}

new_jobs_df2 <- as.data.frame(matrix(nrow = 0, ncol = 10))
files <- list.files(pattern = "_linkedin_.*csv$")

for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste(url_base, files[i], sep = "")
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df2 <- rbind(new_jobs_df2, csv)
        completed_files <- append(completed_files, files[i])
    }
}

writeLines(completed_files, "completed_files.txt")

new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]
new_jobs_df2 <- new_jobs_df2[!duplicated(new_jobs_df2), ]
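
The two loops above are identical apart from the target data frame and the filename pattern, so they could be collapsed into one helper. A hedged sketch (combine_new_csvs is a hypothetical name, not part of the original pipeline):

# hypothetical helper: read every not-yet-processed CSV matching a pattern,
# stack the results, and record those filenames as completed
combine_new_csvs <- function(pattern, completed, base = url_base){
    files <- setdiff(list.files(pattern = pattern), completed)
    dfs <- lapply(paste0(base, files), read.csv, header = TRUE)
    df <- if (length(dfs) > 0) do.call(rbind, dfs) else data.frame()
    list(df = df, completed = c(completed, files))
}

res <- combine_new_csvs("_feeds_.*csv$", completed_files)
new_jobs_df <- res$df
completed_files <- res$completed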

Remove Unnecessary Columns in Jobs Data Frame and Rearrange Remaining Columns to Prepare the Data for the SQL Database:

if (nrow(new_jobs_df) > 0){
    new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
        content, extracted_content_url, published, created_at))
    cols <- c("Job_id", "Site_id", "Job_title", "Job_url")
    colnames(new_jobs_df) <- cols
    Job_company <- as.data.frame(matrix(NA, nrow = nrow(new_jobs_df),
                                   ncol = 1))
    colnames(Job_company) <- "Job_company"
    Job_location <- as.data.frame(matrix(NA, nrow = nrow(new_jobs_df),
                                    ncol = 1))
    colnames(Job_location) <- "Job_location"
    new_jobs_df <- cbind(new_jobs_df, Job_company, Job_location)
    rownames(new_jobs_df) <- NULL
    new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
                                 "Job_company", "Job_location", "Site_id")]
    new_jobs_df %<>%
        mutate(Job_complete = 0)
}

if (nrow(new_jobs_df2) > 0){
    new_jobs_df2 <- subset(new_jobs_df2, select = -c(X, job_url, company_url,
        linkedin_company_url_cleaned, posted_date, normalized_company_name))
    cols <- c("Job_url", "Job_company", "Job_title", "Job_location")
    colnames(new_jobs_df2) <- cols
    Job_id <- as.data.frame(matrix(NA, nrow = nrow(new_jobs_df2),
                                   ncol = 1))
    colnames(Job_id) <- "Job_id"
    Site_id <- as.data.frame(matrix(1001, nrow = nrow(new_jobs_df2),
                                    ncol = 1))
    colnames(Site_id) <- "Site_id"
    new_jobs_df2 <- cbind(new_jobs_df2, Job_id, Site_id)
    rownames(new_jobs_df2) <- NULL
    new_jobs_df2 <- new_jobs_df2[c("Job_id", "Job_title", "Job_url",
                                   "Job_company", "Job_location", "Site_id")]
    new_jobs_df2 %<>%
        mutate(Job_complete = 0)
}
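
Both blocks above build the constant columns by hand with matrix() and cbind(); since dplyr is attached, the same reshaping can be written more compactly. A sketch for the RSS-feed block (behavior assumed equivalent, column positions as in the original):

new_jobs_df <- new_jobs_df %>%
    select(-c(X, author, summary, content,
              extracted_content_url, published, created_at)) %>%
    rename(Job_id = 1, Site_id = 2, Job_title = 3, Job_url = 4) %>%
    mutate(Job_company = NA, Job_location = NA, Job_complete = 0) %>%
    select(Job_id, Job_title, Job_url, Job_company, Job_location,
           Site_id, Job_complete)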

if (nrow(new_jobs_df) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
}
if (nrow(new_jobs_df2) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df2))[!duplicated(Job_url)]
}
jobs_df %<>%
    mutate(Job_id = row_number())

Scrape Each Unique Job Listing URL:

ua <- httr::user_agent("Glen Davis") # identify ourselves to the job boards
for (i in 1:nrow(jobs_df)){
    if (jobs_df[i, 7] == 0){
        # a bare httr::user_agent() call has no effect on its own, so the
        # request is made with httr::GET() and the response parsed as HTML
        resp <- try(httr::GET(jobs_df$Job_url[[i]], ua), silent = TRUE)
        if (inherits(resp, "try-error")){
            jobs_df[i, 7] <- -1
            next
        }
        dat <- httr::content(resp, as = "parsed", encoding = "UTF-8")
    }else{
        next
    }
    if (jobs_df[i, 6] == 2594160){ #ai-jobs.net is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@id, 'job-description')]")
    }
    else if (jobs_df[i, 6] == 977141){ #python.org is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-description')]")
    }
    else if (jobs_df[i, 6] == 2594162){ #careercast it & eng is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'arDetailDescriptionRow')]")
    }
    else if (jobs_df[i, 6] == 1378327){ #jobs for r-users is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'section_content')]")
    }
    else if (jobs_df[i, 6] == 2593879){ #Indeed is source
        jobs_df[i, 7] <- -1
        next
    }
    else if (jobs_df[i, 6] == 2594166){ #Open Data Science is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-desc')]")
    }
    else if (jobs_df[i, 6] == 2594174){ #MLconf is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job_description')]")
    }
    else if (jobs_df[i, 6] == 1001){ #Linkedin is source
        jobs_df[i, 7] <- -1
        next
    }
    else { #unrecognized source: skip rather than reuse a stale desc
        jobs_df[i, 7] <- -1
        next
    }
    
    desc <- xml2::xml_text(desc)
    fn <- paste(jobs_df[i, 1], ".txt", sep = "")
    writeLines(desc, con = fn)
    jobs_df[i, 7] <- 1
}
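
One refinement worth considering: pausing briefly between requests is a common courtesy to the sites being scraped. A minimal sketch (the delay range is an arbitrary choice) that would sit at the end of the loop body:

# inside the for loop, after each page is fetched and parsed:
Sys.sleep(runif(1, min = 1, max = 3)) # wait 1-3 seconds between requests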

Manual Data Collection for LinkedIn:

manual <- jobs_df %>%
    filter(Job_complete == -1 & Site_id == 1001)
write.csv(manual, "manual.csv", row.names = FALSE)

We manually look up the job descriptions for the listings in the manual.csv file and save them as column eight of a manual_edited.csv file. If we find a job description, we change the Job_complete value to 1; if we don't, we simply delete that row. If a previous file exists, we save over it. We then upload the saved manual_edited.csv file to GitHub before continuing.

file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/manual_edited.csv"

manual_edited <- read.csv(file = file, header = TRUE)
for (i in 1:nrow(manual_edited)){
    job_id <- manual_edited[i, 1]
    # Job_id matches row position after the row_number() renumbering above
    if (jobs_df[job_id, 7] != 1){
        job_desc <- manual_edited[i, 8]
        jobs_df[job_id, 7] <- manual_edited[i, 7]
        fn <- paste(job_id, ".txt", sep = "")
        writeLines(job_desc, con = fn)
    }
}

Add Previously Scraped (Nov. 2022) Data for Indeed Jobs (Source: Kaggle):

completed_files <- readLines("completed_files.txt")

if (!("data_science_jobs_indeed_usa.csv" %in% completed_files)){
    file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/data_science_jobs_indeed_usa.csv"
    kaggle_indeed <- read.csv(file = file, header = TRUE)
    kaggle_indeed <- subset(kaggle_indeed, select = -c(1, 5, 6, 7, 8))
    cols <- c("Job_title", "Job_company", "Job_location", "Job_url",
              "Job_description")
    colnames(kaggle_indeed) <- cols
    ids <- seq((nrow(jobs_df) + 1), (nrow(jobs_df) + nrow(kaggle_indeed)))
    Job_id <- as.data.frame(matrix(ids, nrow = nrow(kaggle_indeed),
                                   ncol = 1))
    Site_id <- as.data.frame(matrix(2593879, nrow = nrow(kaggle_indeed),
                                    ncol = 1))
    Job_complete <- as.data.frame(matrix(0, nrow = nrow(kaggle_indeed),
                                         ncol = 1))
    colnames(Site_id) <- "Site_id"
    colnames(Job_id) <- "Job_id"
    colnames(Job_complete) <- "Job_complete"
    kaggle_indeed <- cbind(kaggle_indeed, Job_id, Site_id, Job_complete)
    rownames(kaggle_indeed) <- NULL
    kaggle_indeed <- kaggle_indeed[c("Job_id", "Job_title", "Job_url",
                                     "Job_company", "Job_location",
                                     "Site_id", "Job_complete",
                                     "Job_description")]
    jobs_df <- rbind(jobs_df, subset(kaggle_indeed, select = 1:7))
    for (i in 1:nrow(kaggle_indeed)){
        job_id <- kaggle_indeed[i, 1]
        job_desc <- kaggle_indeed[i, 8]
        jobs_df[job_id, 7] <- 1
        fn <- paste(job_id, ".txt", sep = "")
        writeLines(job_desc, con = fn)
    }
    write("data_science_jobs_indeed_usa.csv", file = "completed_files.txt",
          append = TRUE)
}

Connect to the SQL Database:

con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  password = as.character(read.table("sql_db.txt", header = FALSE)),
  host = "35.227.102.234")

Write to the SQL Database Only If There Were Changes and Disconnect:

tables <- dbListTables(con)
copy <- dbReadTable(con, "_Jobs")

# identical() is strict about class and attributes, and jobs_df may be a
# data.table while dbReadTable() returns a plain data.frame, so compare
# the values after normalizing both
if (!isTRUE(all.equal(as.data.frame(jobs_df), copy,
                      check.attributes = FALSE))){
    dbWriteTable(con, "_Jobs", jobs_df, overwrite = TRUE)
}

dbDisconnect(con)

Set Up a Text Data Frame from the TXT Job Description Files:

files <- list.files(pattern = "^[1-9]+.*txt$")
if (length(files) > 0){
    file.copy(from = paste0(getwd(), "/", files),
              to = paste0(getwd(), "/jobs-txt/", files))
    file.remove(paste0(getwd(), "/", files))
}

files <- list.files(path = paste0(getwd(), "/jobs-txt/"),
                    pattern = "^[1-9]+.*txt$")

cols <- c("Text", "Job_id", "Line")

completed_txt_files <- readLines("completed_txt_files.txt")
if (length(completed_txt_files) == 0){
    text_df <- as.data.frame(matrix(nrow = 0, ncol = 3))
    colnames(text_df) <- cols
}else{
    my_url <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/text_df.csv"
    text_df <- read.csv(file = my_url, header = TRUE,
                        fileEncoding = "UTF-8")
}

url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"

for (i in seq_along(files)){
    if (!files[i] %in% completed_txt_files){
        file <- paste(url_base, files[i], sep = "")
        job_id <- str_replace(files[i], "\\.txt$", "")
        lines <- readLines(file)
        # one row per line of the job description, tagged with its job id
        # and line number
        col2 <- matrix(job_id, nrow = length(lines), ncol = 1)
        col3 <- matrix(seq_along(lines), nrow = length(lines), ncol = 1)
        addition <- cbind(lines, col2, col3)
        colnames(addition) <- cols
        text_df <- rbind(text_df, addition)
        write(files[i], file = "completed_txt_files.txt", append = TRUE)
    }
}

rownames(text_df) <- NULL
write.csv(text_df, "text_df.csv", row.names = FALSE)
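
Growing text_df with rbind() inside a loop copies the whole data frame on every iteration; building the pieces first and binding once scales better as the corpus grows. A hedged alternative producing the same three columns (the bookkeeping of completed_txt_files.txt is omitted here):

new_files <- setdiff(files, completed_txt_files)
additions <- lapply(new_files, function(f){
    lines <- readLines(paste0(url_base, f))
    data.frame(Text = lines,
               Job_id = sub("\\.txt$", "", f),
               Line = seq_along(lines))
})
text_df <- rbind(text_df, do.call(rbind, additions))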

Analyze the Text Data Frame with tidytext:

text_df_clean <- text_df
text_df_clean[, 1] <- tolower(text_df_clean[, 1])
text_df_clean %<>%
    filter(Text != "")

tidy_text_df_words <- text_df_clean %>%
    unnest_tokens(word, Text)

tidy_text_words_analysis <- tidy_text_df_words %>%
    anti_join(get_stopwords()) %>%
    group_by(word) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              # custom score: raw frequency weighted by how many distinct
              # postings mention the term, scaled down for readability
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    arrange(desc(tf_dc_score))
## Joining with `by = join_by(word)`
datatable(tidy_text_words_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
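
To make the scale of tf_dc_score concrete, here is the arithmetic for a hypothetical term:

# hypothetical term: appears 2,000 times across 800 distinct postings
term_freq <- 2000
doc_count <- 800
round(term_freq * doc_count / 1000000, 3)
## [1] 1.6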
tidy_text_words_analysis %>%
    with(wordcloud(word, doc_count, max.words = 50))
## Warning in wordcloud(word, doc_count, max.words = 50): 20 terms, including
## "data", "skills", "experience", "business", and "degree", could not be fit
## on page and were not plotted.
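
The warnings above mean the highest-frequency terms are drawn too large for the plotting region at wordcloud's default scale of c(4, 0.5); shrinking the upper bound usually lets them fit. A minimal sketch (the scale values are an arbitrary choice):

tidy_text_words_analysis %>%
    with(wordcloud(word, doc_count, max.words = 50, scale = c(3, 0.5)))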

tidy_text_df_bigrams <- text_df_clean %>%
    unnest_tokens(bigram, Text, token = "ngrams", n = 2)

tidy_text_bigrams_analysis <- tidy_text_df_bigrams %>%
    separate(bigram, into = c("first","second"),
             sep = " ", remove = FALSE) %>%
    # drop bigrams in which either word is a stopword
    anti_join(stop_words, by = c("first" = "word")) %>%
    anti_join(stop_words, by = c("second" = "word")) %>%
    group_by(bigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    filter(!is.na(bigram)) %>%
    arrange(desc(tf_dc_score))

datatable(tidy_text_bigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
tidy_text_bigrams_analysis %>%
    with(wordcloud(bigram, doc_count, max.words = 50))
## Warning in wordcloud(bigram, doc_count, max.words = 50): 21 bigrams,
## including "machine learning", "data science", "computer science", "data
## analysis", and "equal opportunity", could not be fit on page and were not
## plotted.

tidy_text_df_trigrams <- text_df_clean %>%
    unnest_tokens(trigram, Text, token = "ngrams", n = 3)

tidy_text_trigrams_analysis <- tidy_text_df_trigrams %>%
    separate(trigram, into = c("first","second","third"),
             sep = " ", remove = FALSE) %>%
    # filter stopwords only in the first and third positions, so phrases
    # with a stopword in the middle (e.g. "hands on experience") survive
    anti_join(stop_words, by = c("first" = "word")) %>%
    anti_join(stop_words, by = c("third" = "word")) %>%
    group_by(trigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    filter(!is.na(trigram)) %>%
    arrange(desc(tf_dc_score))

datatable(tidy_text_trigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
tidy_text_trigrams_analysis %>%
    with(wordcloud(trigram, doc_count, max.words = 50))
## Warning in wordcloud(trigram, doc_count, max.words = 50): 27 trigrams,
## including "equal opportunity employer", "machine learning algorithms",
## "natural language processing", and "hands on experience", could not be
## fit on page and were not plotted.